# Sentiment Analysis on Tweets about Ronaldo

# Packages required for this analysis.
# NOTE: the original vector listed "hms" and "lubridate" twice, which makes
# install.packages() attempt redundant installs; duplicates are removed here.
packages_to_install <- c("hms", "lubridate", "tidytext", "tm", "wordcloud",
                         "igraph", "glue", "networkD3", "plyr", "stringr",
                         "ggplot2", "ggeasy", "plotly", "dplyr",
                         "magrittr", "tidyverse", "janeaustenr", "widyr")

# Install packages (uncomment to run once)
#chooseCRANmirror(graphics=FALSE)
#install.packages(packages_to_install)
library(hms)
library(lubridate) 
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
## 
##     hms
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidytext)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(glue)
library(networkD3)
library(plyr)
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.3
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(ggeasy)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)  
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hms)
library(lubridate) 
library(magrittr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.0
## ✔ readr   2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ igraph::%--%()          masks lubridate::%--%()
## ✖ ggplot2::annotate()     masks NLP::annotate()
## ✖ dplyr::arrange()        masks plotly::arrange(), plyr::arrange()
## ✖ tibble::as_data_frame() masks dplyr::as_data_frame(), igraph::as_data_frame()
## ✖ purrr::compact()        masks plyr::compact()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ dplyr::count()          masks plyr::count()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ dplyr::desc()           masks plyr::desc()
## ✖ tidyr::extract()        masks magrittr::extract()
## ✖ dplyr::failwith()       masks plyr::failwith()
## ✖ dplyr::filter()         masks plotly::filter(), stats::filter()
## ✖ lubridate::hms()        masks hms::hms()
## ✖ dplyr::id()             masks plyr::id()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ dplyr::mutate()         masks plotly::mutate(), plyr::mutate()
## ✖ dplyr::rename()         masks plotly::rename(), plyr::rename()
## ✖ purrr::set_names()      masks magrittr::set_names()
## ✖ purrr::simplify()       masks igraph::simplify()
## ✖ dplyr::summarise()      masks plotly::summarise(), plyr::summarise()
## ✖ dplyr::summarize()      masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janeaustenr)
library(widyr)

# Load the cleaned tweet corpus --------------------------------------------
file_path <- "data/Cleaned_ronaldo_tweets.csv"
tweets_df <- read.csv(file_path)

# Quick look at the columns and types we just loaded
str(tweets_df)
## 'data.frame':    501926 obs. of  10 variables:
##  $ tweet_id     : num  1.55e+18 1.55e+18 1.55e+18 1.55e+18 1.55e+18 ...
##  $ author_id    : num  1.41e+18 1.33e+18 1.19e+18 7.27e+17 1.32e+18 ...
##  $ content      : chr  "we fell down to with ronaldo and no big team who wins trophies want him yet you want him at to be our main stri"| __truncated__ "man utd transfer news live frenkie de jong final bid latest cristiano ronaldo admission tielemans interested" "r and ronaldinho make me smile when i saw than on the field stats of course cr but loving football r and ronaldinho" "ronaldo was in the best champions league team ever winning in a row poor pessi was losing and getting sacked fr"| __truncated__ ...
##  $ lang         : chr  "en" "en" "en" "en" ...
##  $ date         : chr  "2022-08-02T07:34:06.000Z" "2022-08-02T07:34:00.000Z" "2022-08-02T07:33:40.000Z" "2022-08-02T07:33:38.000Z" ...
##  $ source       : chr  "Twitter for Android" "Publer.io" "Twitter for iPhone" "Twitter for Android" ...
##  $ geo          : chr  "-1" "-1" "-1" "-1" ...
##  $ retweet_count: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ like_count   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ quote_count  : num  0 0 0 0 0 0 0 0 0 0 ...
# Sentiment lexicons --------------------------------------------------------
# The opinion-lexicon files mark comment lines with ';', hence comment.char.
positive <- scan("data/resources/positive-words.txt", what = "character",
                 comment.char = ";")
negative <- scan("data/resources/negative-words.txt", what = "character",
                 comment.char = ";")

# Extend the published lexicons with extra (Twitter-flavoured) terms that
# the base word lists are missing.
pos.words <- c(positive,
               "upgrade", "Congrats", "prizes", "prize", "thanks", "thnx",
               "Grt", "gr8", "plz", "trending", "recovering", "brainstorm",
               "leader")
neg.words <- c(negative,
               "wtf", "wait", "waiting", "epicfail", "Fight", "fighting",
               "arrest", "no", "not")
# Score each sentence by counting positive and negative lexicon hits.
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive terms (lower-case expected,
#              since sentences are lower-cased before matching).
#   neg.words: character vector of negative terms.
#   .progress: retained for backward compatibility with the previous
#              plyr::laply-based implementation; it is no longer used
#              (no progress bar is shown).
#
# Returns:
#   A data.frame with columns `score` (positive-word hits minus
#   negative-word hits per sentence) and `text` (the original sentence).
#
# Implementation note: the previous version called require(plyr) /
# require(stringr) inside the function (an anti-pattern — require() returns
# FALSE instead of erroring) and used laply/str_split for what is a simple
# per-element map. Base vapply + strsplit do the same work with no extra
# dependencies; tokenization still splits on runs of whitespace ("\\s+").
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  scores <- vapply(sentences, function(sentence) {
    # Lower-case so matching against the lexicons is case-insensitive.
    sentence <- tolower(sentence)

    # Split into words on runs of whitespace.
    words <- unlist(strsplit(sentence, '\\s+'))

    # match() returns the position of a hit or NA; count non-NA hits.
    n_pos <- sum(!is.na(match(words, pos.words)))
    n_neg <- sum(!is.na(match(words, neg.words)))

    # TRUE/FALSE counts already summed; score is the signed difference.
    n_pos - n_neg
  }, integer(1), USE.NAMES = FALSE)

  data.frame(score = scores, text = sentences)
}

# Batch-processing parameters ----------------------------------------------
chunk_size <- 1000  # tweets per chunk; adjust to taste

# How many chunks the full data set would split into
num_chunks <- ceiling(nrow(tweets_df) / chunk_size)
print(num_chunks)
## [1] 502
library(plotly)

# Score every tweet with the lexicon scorer, then inspect the frequency of
# each sentiment score across the corpus.
cleanText <- tweets_df$content
analysis <- score.sentiment(cleanText, pos.words, neg.words)
table(analysis$score)
## 
##    -15    -13    -10     -9     -8     -7     -6     -5     -4     -3     -2 
##      1      3     19     24     59    158    431   1126   3259   9398  28652 
##     -1      0      1      2      3      4      5      6      7      8      9 
##  83272 208448 105790  38064  14860   5216   1964    755    289     84     26 
##     10     11     12     13     14     17 
##     14      6      4      2      1      1
# Histogram of per-tweet sentiment scores
analysis %>%
  ggplot(aes(x = score)) +
  geom_histogram(binwidth = 1, fill = "lightblue") +
  labs(x = "sentiment score",
       y = "Frequency",
       title = "Distribution of Sentiment scores of the tweets") +
  ggeasy::easy_center_title()

library(plotly)

# Parse the ISO-8601 timestamp strings into POSIXct datetimes
tweets_df$date <- lubridate::as_datetime(tweets_df$date)

# Pair each tweet's timestamp with its sentiment score
date_sentiment <- data.frame(date = tweets_df$date, sentiment = analysis$score)

# Static time-series view of sentiment
ggplot(date_sentiment, aes(x = date, y = sentiment)) +
  geom_line() +
  labs(x = "Date",
       y = "Sentiment Score",
       title = "Sentiment Over Time") +
  ggeasy::easy_center_title()

# Interactive (Plotly) version of the sentiment time series
plot_ly(date_sentiment,
        x = ~date, y = ~sentiment,
        type = "scatter", mode = "lines") %>%
  layout(
    title = "Sentiment Over Time",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Sentiment Score")
  )
# Temporal coverage of the corpus -------------------------------------------

# Month-year label for each tweet, e.g. "2022-08"
tweets_df$month_year <- format(tweets_df$date, "%Y-%m")

# Number of tweets in each month-year
tweet_counts <- table(tweets_df$month_year)

# Months that contain at least one tweet
time_frames_with_tweets <- names(tweet_counts[tweet_counts > 0])

# Print the identified time frames
#cat("Time frames with tweets:", time_frames_with_tweets, "\n")

# Distinct timestamps, in chronological order
dates_with_tweets <- unique(tweets_df$date)
sorted_dates <- sort(dates_with_tweets)
#print(sorted_dates)

# Distinct "day, month" labels (e.g. "02, Aug") present in the data
unique_day_month_dates <- unique(format(tweets_df$date, "%d, %b"))
cat("Unique day, month dates with tweets:", unique_day_month_dates, "\n")
## Unique day, month dates with tweets: 02, Aug 01, Aug 31, Jul 23, Jun 22, Jun 21, Jun 20, Jun 19, Jun 18, Jun 17, Jun 16, Jun 06, Jun 05, Jun 04, Jun 27, May 26, May 25, May 24, May 23, May 19, May 18, May 17, May 13, May 12, May 09, May 08, May 07, May 06, May 05, May 04, May 02, May 01, May 30, Apr 27, Apr 26, Apr 25, Apr 20, Apr 17, Apr 16, Apr 15, Apr 14, Apr 13, Apr 12, Apr 11, Apr 10, Apr 09, Apr 18, Apr
# Print the list of consecutive days
#cat("Consecutive days with tweets:", consecutive_days, "\n")